{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "693a8e01",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/ms-en/resolve/main/ms-en-left.train\n",
    "# !wget https://huggingface.co/datasets/mesolitica/ms-en/resolve/main/ms-en-right.train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bf16e4eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b57ef08a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow_addons/utils/ensure_tf_install.py:53: UserWarning: Tensorflow Addons supports using Python ops for all Tensorflow versions above or equal to 2.3.0 and strictly below 2.5.0 (nightly versions are not supported). \n",
      " The versions of TensorFlow you are currently using is 2.6.0 and is not supported. \n",
      "Some things might work, some things might not.\n",
      "If you were to encounter a bug, do not file an issue.\n",
      "If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. \n",
      "You can find the compatibility matrix in TensorFlow Addon's readme:\n",
      "https://github.com/tensorflow/addons\n",
      "  warnings.warn(\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow_addons/utils/resource_loader.py:72: UserWarning: You are currently using TensorFlow 2.6.0 and trying to load a custom op (custom_ops/seq2seq/_beam_search_ops.so).\n",
      "TensorFlow Addons has compiled its custom ops against TensorFlow 2.4.0, and there are no compatibility guarantees between the two versions. \n",
      "This means that you might get segfaults when loading the custom op, or other kind of low-level errors.\n",
      " If you do, do not file an issue on Github. This is a known limitation.\n",
      "\n",
      "It might help you to fallback to pure Python ops with TF_ADDONS_PY_OPS . To do that, see https://github.com/tensorflow/addons#gpucpu-custom-ops \n",
      "\n",
      "You can also change the TensorFlow version installed on your system. You would need a TensorFlow version equal to or above 2.4.0 and strictly below 2.5.0.\n",
      " Note that nightly versions of TensorFlow, as well as non-pip TensorFlow like `conda install tensorflow` or compiled from source are not supported.\n",
      "\n",
      "The last solution is to find the TensorFlow Addons version that has custom ops compatible with the TensorFlow installed on your system. To do that, refer to the readme: https://github.com/tensorflow/addons\n",
      "  warnings.warn(\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:35: UserWarning: Cannot import beam_search_ops from Tensorflow Addons, ['malaya.jawi_rumi.deep_model', 'malaya.phoneme.deep_model', 'malaya.rumi_jawi.deep_model', 'malaya.stem.deep_model'] will not available to use, make sure Tensorflow Addons version >= 0.12.0\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "import tensorflow as tf\n",
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4bfd7f6f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "fast_text = malaya.language_detection.fasttext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8034ffc9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['augmented-ms-en-v2.json',\n",
       " 'augmented-ms-en-3.json',\n",
       " 'augmented-ms-en-2.json',\n",
       " 'augmented-ms-en-v3.json',\n",
       " 'augmented-ms-en-1.json']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "augmented = glob('augmented-ms-en-*.json')\n",
    "augmented = [f for f in augmented if 'test' not in f]\n",
    "augmented"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fac3e186",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 84178/84178 [00:00<00:00, 2642270.58it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 335872/335872 [00:00<00:00, 2557029.31it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 138131/138131 [00:00<00:00, 2533467.17it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 419750/419750 [00:00<00:00, 2766708.68it/s]\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 273070/273070 [00:00<00:00, 2531599.37it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "lefts, rights = [], []\n",
    "\n",
    "for file in augmented:\n",
    "    with open(file) as fopen:\n",
    "        data = json.load(fopen)\n",
    "\n",
    "    for i in tqdm(range(len(data['ms']))):\n",
    "        if len(data['ms'][i]) and len(data['en'][i]):\n",
    "            lefts.append(data['ms'][i])\n",
    "            rights.append(data['en'][i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "92cdbab6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3252225/3252225 [00:00<00:00, 3641086.37it/s]\n"
     ]
    }
   ],
   "source": [
    "with open('ms-en-left.train') as fopen:\n",
    "    left = fopen.read().split('\\n')\n",
    "\n",
    "with open('ms-en-right.train') as fopen:\n",
    "    right = fopen.read().split('\\n')\n",
    "\n",
    "\n",
    "for i in tqdm(range(len(left))):\n",
    "    if len(left[i]) and len(right[i]):\n",
    "        lefts.append(left[i])\n",
    "        rights.append(right[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8bb46eff",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7fab6f1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf t5-noisy-ms-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "bea20e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir t5-noisy-ms-en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "00279b17",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import shuffle\n",
    "\n",
    "lefts, rights = shuffle(lefts, rights)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f868b2c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 500000\n",
    "for i in range(0, len(lefts), batch_size):\n",
    "    b_left = lefts[i: i + batch_size]\n",
    "    b_right = rights[i: i + batch_size]\n",
    "    with tf.io.gfile.GFile(f't5-noisy-ms-en/{i}.tsv', 'w') as outfile:\n",
    "        for k in range(len(b_left)):\n",
    "            l = cleaning(b_left[k])\n",
    "            r = cleaning(b_right[k])\n",
    "            outfile.write(\"%s\\t%s\\n\" % (l, r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "df549a1f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['t5-noisy-ms-en/1500000.tsv',\n",
       " 't5-noisy-ms-en/2500000.tsv',\n",
       " 't5-noisy-ms-en/0.tsv',\n",
       " 't5-noisy-ms-en/4000000.tsv',\n",
       " 't5-noisy-ms-en/3000000.tsv',\n",
       " 't5-noisy-ms-en/4500000.tsv',\n",
       " 't5-noisy-ms-en/2000000.tsv',\n",
       " 't5-noisy-ms-en/500000.tsv',\n",
       " 't5-noisy-ms-en/1000000.tsv',\n",
       " 't5-noisy-ms-en/3500000.tsv']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "glob('t5-noisy-ms-en/*.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "23556d86",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "import t5\n",
    "import functools\n",
    "from t5 import models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "59b70409",
   "metadata": {},
   "outputs": [],
   "source": [
    "def translation_dataset(split, shuffle_files=False):\n",
    "    del shuffle_files\n",
    "    ds = tf.data.TextLineDataset(glob('t5-noisy-ms-en/*.tsv'))\n",
    "\n",
    "    ds = ds.map(\n",
    "        functools.partial(\n",
    "            tf.io.decode_csv,\n",
    "            record_defaults=['', ''],\n",
    "            field_delim='\\t',\n",
    "            use_quote_delim=False,\n",
    "        ),\n",
    "        num_parallel_calls=tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))\n",
    "    return ds\n",
    "\n",
    "\n",
    "def translation_preprocessor(ds):\n",
    "    def to_inputs_and_targets(ex):\n",
    "        return {\n",
    "            'inputs': tf.strings.join(['terjemah Melayu ke Inggeris: ', ex['question']]),\n",
    "            'targets': ex['answer'],\n",
    "        }\n",
    "\n",
    "    return ds.map(\n",
    "        to_inputs_and_targets,\n",
    "        num_parallel_calls=tf.data.experimental.AUTOTUNE,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "3bbb7b91",
   "metadata": {},
   "outputs": [],
   "source": [
    "import seqio\n",
    "\n",
    "DEFAULT_SPM_PATH = vocab = 'sp10m.cased.ms-en.model'\n",
    "DEFAULT_EXTRA_IDS = 100\n",
    "\n",
    "\n",
    "def get_default_vocabulary():\n",
    "    return seqio.SentencePieceVocabulary(DEFAULT_SPM_PATH, DEFAULT_EXTRA_IDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "5b387cac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<seqio.dataset_providers.Mixture at 0x7fc230897250>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "t5.data.TaskRegistry.remove('translation_dataset')\n",
    "\n",
    "t5.data.TaskRegistry.add(\n",
    "    'translation_dataset',\n",
    "    dataset_fn=translation_dataset,\n",
    "    splits=['train'],\n",
    "    text_preprocessor=[translation_preprocessor],\n",
    "    postprocess_fn=t5.data.postprocessors.lower_text,\n",
    "    metric_fns=[t5.evaluation.metrics.accuracy],\n",
    "    output_features = seqio.Feature(get_default_vocabulary())\n",
    ")\n",
    "t5.data.MixtureRegistry.remove('translation_bahasa')\n",
    "t5.data.MixtureRegistry.add(\n",
    "    'translation_bahasa',\n",
    "    ['translation_dataset'],\n",
    "    default_rate=1.0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "09305e73",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-07-06 16:22:22.222941: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected\n",
      "2022-07-06 16:22:22.222966: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop\n",
      "2022-07-06 16:22:22.222970: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop\n",
      "2022-07-06 16:22:22.223035: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program\n",
      "2022-07-06 16:22:22.223053: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6\n",
      "2022-07-06 16:22:22.223216: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"translation_dataset\")\n",
    "ds = nq_task.get_dataset(split='knowledge-graph.tsv', sequence_length={\"inputs\": 512, \"targets\": 512})\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "69c837df",
   "metadata": {},
   "outputs": [],
   "source": [
    "r = r._make_iterator_fn()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "a4163adc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-07-06 16:22:22.954282: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'inputs_pretokenized': b'terjemah Melayu ke Inggeris: This because ketika tree itu was kecil and sudah ngeluarin buah, saya hanya menggunakan baja organik he,\" katanya.',\n",
       " 'inputs': array([   13, 26087,  1550,    55,  2040,    31,   263,   229,   123,\n",
       "         7421,    37,    39,   439,    20,   391,    13, 14056, 12995,\n",
       "          153,  1508,    14,    67,   169,   311, 18383, 13555,    57,\n",
       "           14,     6,   194,     3,     1], dtype=int32),\n",
       " 'targets_pretokenized': b'This is because when the tree was young and the fruit was out, I used only organic fertilizers, \"he said.',\n",
       " 'targets': array([  263,    26,   229,   146,    15,  7421,    39,  1169,    20,\n",
       "           15,  7307,    39,   131,    14,    59,   419,   220, 14017,\n",
       "        24112,    16,    14,    13,     6,   609,    49,     3,     1],\n",
       "       dtype=int32)}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cb0a09f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
